Loading and preprocessing the data
# Required Packages:
# dplyr      - %>%, mutate, filter, group_by, summarize, case_when
# data.table - chmatch
# ggplot2    - ggplot and friends
# plotly     - plot_ly, layout, ggplotly
# Attach them explicitly so the analysis runs from a fresh R session.
library(data.table)
library(dplyr) # attached after data.table so dplyr wins any name clashes
library(ggplot2)
library(plotly)
# Load compressed data:
# unzip() extracts the archive into the working directory and returns the
# extracted path, which read.csv() then reads.
raw.data <- read.csv(unzip("./activity.zip"))
# Process data by converting date to POSIX:
processed.data <- raw.data %>% mutate(
  # Pad leading zeros so every interval is a 4-character HHMM string
  interval = sprintf("%04d", interval),
  # The date and interval carry no time zone, so parse them in UTC; in a
  # local zone a daylight-saving gap can turn a valid clock time into NA
  # or shift it.
  dateTime = as.POSIXct(paste0(date, " ", interval),
                        format = "%Y-%m-%d %H%M", tz = "UTC"),
  date = as.Date(date, format = "%Y-%m-%d")
)
What is the mean total number of steps taken per day?
# Create clean data with NA values removed (kept grouped by date so the
# summarize() below yields one row per day):
step.data <- processed.data %>%
  filter(!is.na(steps)) %>%
  group_by(date)
# Determine the Total Steps Taken Per Day:
total.steps <- step.data %>% summarize(Count = sum(steps))
# Determine Mean/Median of steps per day:
# These are statistics of the per-day totals, so they are taken over
# total.steps$Count. Averaging step.data$steps instead would give the mean
# and median of a single 5-minute interval, not of a day.
mean.steps <- mean(total.steps$Count)
median.steps <- median(total.steps$Count)
# Create Histogram showing total number of steps per day:
hist(total.steps$Count, xlab = "Steps Taken Per Day",
     main = "Histogram: Steps Taken Per Day")

cat(paste0("Mean Total Number of Steps Per Day: ", mean.steps))
cat(paste0("Median Total Number of Steps Per Day: ", median.steps))
# NOTE(review): the output previously recorded here (mean 37.3825995807128,
# median 0) was produced from per-interval step counts; re-run the report to
# capture the per-day figures.
What is the average daily activity pattern?
# Mean step count for each 5-minute interval, averaged over every day in the
# data set (rows with a missing step count are dropped first):
daily.data <- summarize(
  group_by(filter(processed.data, !is.na(steps)), interval),
  average_steps = mean(steps)
)
# Time series of the average steps taken in each 5-minute interval:
layout(
  plot_ly(daily.data, x = ~interval, y = ~average_steps,
          type = "scatter", mode = "lines"),
  title = "Average Daily Activity Pattern",
  xaxis = list(title = "Interval (5 minute)"),
  yaxis = list(title = "Average Steps")
)
# Report the 5-minute interval whose average step count is the largest:
cat(
  "Interval (5-min) Containing Max Steps: ",
  with(daily.data, interval[which.max(average_steps)]),
  sep = ""
)
## Interval (5-min) Containing Max Steps: 0835
Imputing missing values
# Determine number of missing values:
missing <- sum(is.na(processed.data$steps)) # Note: Other fields didn't contain NA
cat(paste0("Total Number of Missing Values (NA): ", missing))
## Total Number of Missing Values (NA): 2304
# Strategy for filling in NA's: Use Mean value for 5-minute interval, since in
# certain cases the entire day is missing, making it difficult to interpolate
cat("To approximate missing values, use the mean value for the 5-minute interval")
## To approximate missing values, use the mean value for the 5-minute interval
# Create new dataset using strategy:
new.processed.data <- processed.data %>% mutate(
  steps = ifelse(
    !is.na(steps),
    steps, # If not NA, keep original value
    # Otherwise look up the average for the same interval; base match() does
    # the same key lookup chmatch() did, without needing data.table.
    daily.data$average_steps[match(interval, daily.data$interval)]
  )
)
# Total steps per day after correction (one row per date):
new.total.steps <- new.processed.data %>%
  group_by(date) %>%
  summarize(Count = sum(steps))
# Determine Mean/Median of steps per day after correction:
# Taken over the per-day totals; using new.processed.data$steps would
# describe a single 5-minute interval rather than a day.
new.mean.steps <- mean(new.total.steps$Count)
new.median.steps <- median(new.total.steps$Count)
# Create new histogram of data showing steps per day after correction:
hist(new.total.steps$Count,
     xlab = "Steps Taken Per Day (Including Missing Correction)",
     main = "Histogram: Steps Taken Per Day")

cat(paste0("Mean Total Number of Steps Per Day (corrected): ", new.mean.steps))
cat(paste0("Median Total Number of Steps Per Day (corrected): ", new.median.steps))
# NOTE(review): the output previously recorded here (mean 37.3825995807128,
# median 0) was produced from per-interval step counts; re-run the report to
# capture the per-day figures.
# Determine the impact of imputing missing data on total daily steps:
new.grand.total <- sum(new.total.steps$Count)
old.grand.total <- sum(total.steps$Count)
# sep = "" keeps cat() from inserting a space between every piece of the
# message (the earlier call closed paste0() after the first string).
if (new.grand.total > old.grand.total) {
  cat("Imputing Missing Data INCREASED total number of daily steps: ",
      "(", new.grand.total, " vs. ", old.grand.total, ")", sep = "")
} else if (new.grand.total < old.grand.total) {
  cat("Imputing Missing Data DECREASED total number of daily steps: ",
      "(", new.grand.total, " vs. ", old.grand.total, ")", sep = "")
} else {
  cat("Imputing Missing Data had no change on total number of daily steps")
}
## Imputing Missing Data INCREASED total number of daily steps: (656737.5 vs. 570608)
Are there differences in activity patterns between weekdays and weekends?
# Label every row as falling on a weekday or a weekend day.
new.processed.data <- new.processed.data %>% mutate(
  weekday = weekdays(date),
  # Classify by day number rather than by day name: weekdays() returns
  # names in the session locale, so matching 'Saturday|Sunday' fails outside
  # English locales. POSIXlt wday runs 0 (Sunday) through 6 (Saturday).
  w = case_when(as.POSIXlt(date)$wday %in% c(0, 6) ~ "Weekend",
                TRUE ~ "Weekday")
)
# Average steps per 5-minute interval, separately for weekdays and weekends.
weekly.data <- new.processed.data %>%
  group_by(w, interval) %>%
  summarize(average_steps = mean(steps)) %>%
  ungroup()
# interval is a character (discrete) x, so without an explicit group ggplot
# would treat each interval as its own group and never connect the points
# across the day; group = w gives one line per panel.
p <- ggplot(weekly.data, aes(x = interval, y = average_steps, group = w)) +
  geom_line() +
  # facet_grid(. ~ w, scales="free_x")
  # Label only a subset of the intervals (every 30th row of the data) so the
  # axis text stays readable.
  scale_x_discrete(
    breaks = new.processed.data$interval[c(TRUE, rep(FALSE, times = 29))]
  ) +
  facet_wrap(~w, ncol = 1) +
  ylab("Average Steps") +
  theme(axis.text.x = element_text(angle = -90, hjust = 1),
        axis.title.x = element_blank())
ggplotly(p = p)